{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import matplotlib\n",
    "matplotlib.rcParams['figure.figsize'] = [8, 3]\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import statsmodels.api as sm\n",
    "import statsmodels\n",
    "\n",
    "import scipy\n",
    "from scipy.stats import pearsonr\n",
    "\n",
    "from pandas.plotting import register_matplotlib_converters\n",
    "register_matplotlib_converters()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(matplotlib.__version__)\n",
    "print(pd.__version__)\n",
    "print(np.__version__)\n",
    "print(statsmodels.__version__)\n",
    "print(scipy.__version__)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Obtain and visualize data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## data obtained from https://datahub.io/core/global-temp#data\n",
    "df = pd.read_csv(\"global_temps.csv\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.Mean[:100].plot()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exercise: what is wrong with the data and plot above? How can we fix this?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    " df = df.pivot(index='Date', columns='Source', values='Mean')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.GCAG.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "type(df.index)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exercise: how can we make the index more time aware?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.index = pd.to_datetime(df.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "type(df.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.GCAG.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['1880']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.plot(df['1880':'1950'][['GCAG', 'GISTEMP']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.plot(df['1950':][['GISTEMP']])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exercise: How strongly do these measurements correlate contemporaneously? What about with a time lag?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.scatter(df['1880':'1900'][['GCAG']], df['1880':'1900'][['GISTEMP']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.scatter(df['1880':'1899'][['GCAG']], df['1881':'1900'][['GISTEMP']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pearsonr(df['1880':'1899'].GCAG, df['1881':'1900'].GISTEMP)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['1880':'1899'][['GCAG']].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['1881':'1900'][['GISTEMP']].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "min(df.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "max(df.index)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Unobserved component model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = df['1960':]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### model parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# smooth trend model without seasonal or cyclical components\n",
    "model = {\n",
    "    'level': 'smooth trend', 'cycle': False, 'seasonal': None, \n",
    "}\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### fitting a model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# https://www.statsmodels.org/dev/generated/statsmodels.tsa.statespace.structural.UnobservedComponents.html\n",
    "gcag_mod = sm.tsa.UnobservedComponents(train['GCAG'], **model)\n",
    "gcag_res = gcag_mod.fit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = gcag_res.plot_components(legend_loc='lower right', figsize=(15, 9));"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Plotting predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Perform rolling prediction and multistep forecast\n",
    "num_steps = 20\n",
    "predict_res = gcag_res.get_prediction(dynamic=train['GCAG'].shape[0] - num_steps)\n",
    "\n",
    "predict = predict_res.predicted_mean\n",
    "ci = predict_res.conf_int()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.plot(predict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.scatter(train['GCAG'], predict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots()\n",
    "# Plot the results\n",
    "ax.plot(train['GCAG'], 'k.', label='Observations');\n",
    "ax.plot(train.index[:-num_steps], predict[:-num_steps], label='One-step-ahead Prediction');\n",
    "\n",
    "ax.plot(train.index[-num_steps:], predict[-num_steps:], 'r', label='Multistep Prediction');\n",
    "ax.plot(train.index[-num_steps:], ci.iloc[-num_steps:], 'k--');\n",
    "\n",
    "# Cleanup the image\n",
    "legend = ax.legend(loc='upper left');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots()\n",
    "# Plot the results\n",
    "ax.plot(train.index[-40:], train['GCAG'][-40:], 'k.', label='Observations');\n",
    "ax.plot(train.index[-40:-num_steps], predict[-40:-num_steps], label='One-step-ahead Prediction');\n",
    "\n",
    "ax.plot(train.index[-num_steps:], predict[-num_steps:], 'r', label='Multistep Prediction');\n",
    "ax.plot(train.index[-num_steps:], ci.iloc[-num_steps:], 'k--');\n",
    "\n",
    "# Cleanup the image\n",
    "legend = ax.legend(loc='upper left');"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exercise: consider adding a seasonal term for 12 periods for the model fit above. Does this improve the fit of the model?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "seasonal_model = {\n",
    "    'level': 'local linear trend',\n",
    "    'seasonal': 12\n",
    "}\n",
    "mod = sm.tsa.UnobservedComponents(train['GCAG'], **seasonal_model)\n",
    "res = mod.fit(method='powell', disp=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = res.plot_components(legend_loc='lower right', figsize=(15, 9));"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## How does this compare to the original model?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pearsonr(gcag_res.predict(), train['GCAG'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(np.abs(gcag_res.predict() - train['GCAG']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(np.abs(res.predict() - train['GCAG']))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Explore the seasonality more"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "seasonal_model = {\n",
    "    'level': 'local level',\n",
    "    'seasonal': 12\n",
    "}\n",
    "llmod = sm.tsa.UnobservedComponents(train['GCAG'], **seasonal_model)\n",
    "ll_level_res = llmod.fit(method='powell', disp=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = ll_level_res.plot_components(legend_loc='lower right', figsize=(15, 9));"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(np.abs(ll_level_res.predict() - train['GCAG']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train[:48].GCAG.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exercise: a common null model for time series is to predict the value at time t-1 for the value at time t. How does such a model compare to the models we fit here?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Consider correlation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pearsonr(ll_level_res.predict(), train['GCAG'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pearsonr(train['GCAG'].iloc[:-1, ], train['GCAG'].iloc[1:, ])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### What about mean absolute error?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(np.abs(ll_level_res.predict() - train['GCAG']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(np.abs(train['GCAG'].iloc[:-1, ].values, train['GCAG'].iloc[1:, ].values))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
